Python(ML)

1. Min-Max Normalization

This problem isn't difficult; it mainly tests the use of numpy, and basically everyone was able to solve it.

You can also use a plain loop instead of numpy; a sketch of that version follows the answer below.

# answer
def normalization(numbers):
    import numpy as np
    numbers = np.array(numbers)
    res = (numbers - numbers.min()) / (numbers.max() - numbers.min())
    return list(res)
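
For completeness, here is the loop-based version mentioned above. It is a minimal sketch that assumes the input contains at least two distinct values (otherwise the denominator would be zero):

# loop-based alternative, no numpy
def normalization_loop(numbers):
    lo, hi = min(numbers), max(numbers)
    return [(x - lo) / (hi - lo) for x in numbers]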

2. 34-42-4-28

This problem was inspired by an AI challenge from the 2023 national information security competition; it involves recovering a matrix by inversion, and basically everyone was able to solve it.

With a bit of linear algebra you can derive the expression for A (since B = A · W, it follows that A = B · W^-1), and then compute it with numpy; rounding the result gives a very tidy answer: 2023.

# answer
import numpy as np

B = np.matrix([
    [34, 42, 4, 28],
])
W = np.matrix([
    [4, 6, 2, 9],
    [9, 3, 5, 1],
    [7, 3, 0, 2],
    [4, 8, 0, 2],
])

A = np.matmul(B, W.I)  # A = B * W^-1
A = np.rint(A)
print(A)
# [[2. 0. 2. 3.]] -> 2023
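
As a quick sanity check (not part of the reference answer), the same computation with plain ndarrays and np.linalg.inv gives the identical result, and the rounded A multiplied back by W reproduces B exactly:

# sanity check with plain ndarrays
import numpy as np

B = np.array([[34, 42, 4, 28]])
W = np.array([
    [4, 6, 2, 9],
    [9, 3, 5, 1],
    [7, 3, 0, 2],
    [4, 8, 0, 2],
])

A = np.rint(B @ np.linalg.inv(W))
print(A)                          # [[2. 0. 2. 3.]]
print(np.array_equal(A @ W, B))   # True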

3. Draw a Box

Starting with this problem, some people could no longer finish. The common obstacles were not knowing how to read the data with pandas and not knowing how to add the box.

This problem was inspired by a common step in multi-object detection with convolutional neural networks. Drawing bounding boxes is genuinely common in practice, and it also gives a chance to test data reading, computation, and plotting, so it kills three birds with one stone.

The code just follows the flow of reading the data, computing the box geometry, and plotting, using pandas and then matplotlib.

# answer
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_csv("box_data.csv")
x1, y1, x2, y2 = data.loc[0]
xy = (x1, y1)
width = x2 - x1
height = y2 - y1

img = plt.imread("img.jpg")
rect = plt.Rectangle(xy, width, height, color="red", fill=None)
axes = plt.subplot()
plt.imshow(img)
axes.add_patch(rect)
plt.show()
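
Since the inspiration is multi-object detection, a natural extension is drawing several boxes at once. This is only a hypothetical sketch: it assumes a CSV with one box per row (the reference answer above only uses the first row), but the logic is the same read-compute-plot loop:

# hypothetical multi-box variant: one box per CSV row
import matplotlib.pyplot as plt
import pandas as pd

data = pd.read_csv("box_data.csv")
img = plt.imread("img.jpg")
axes = plt.subplot()
plt.imshow(img)
for _, (x1, y1, x2, y2) in data.iterrows():
    axes.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, color="red", fill=None))
plt.show()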

4. Flower Classification and Fully Supervised Machine Learning

Only about 23% of people solved this problem completely, and about 48% got some of the tasks done; linear regression was the most popular choice among the three models.

Some candidates got a bit carried away with this one and added a lot of code the tasks never asked for, which ate into the time left for problem 5.

This problem tests three classic fully supervised learning models plus dataset preprocessing, so it demands a solid grasp of both machine learning theory and practice.

I had already preprocessed the data for this problem in advance, so compared with the raw dataset it is much simpler (just kidding).

# answer
import numpy as np
import pandas as pd

data = pd.read_csv("flower_data.csv")
data = data.replace(["setosa", "versicolor", "virginica"], [0, 1, 2])
data = data.to_numpy()
inputs = data[:, :4]
labels = data[:, 4]
test = [[6, 2.2, 4, 1]]

def to_predict(idx):
    temp = ["setosa", "versicolor", "virginica"]
    return temp[idx]

def predict_linear(test):
    from sklearn.linear_model import LinearRegression
    reg = LinearRegression().fit(inputs, labels)
    res = int(reg.predict(test)[0] + 0.5)
    return to_predict(res)

def predict_tree(test):
    from sklearn import tree
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(inputs, labels)
    res = int(clf.predict(test)[0] + 0.5)
    return to_predict(res)

def predict_svm(test):
    from sklearn import svm
    clf = svm.SVC()
    clf.fit(inputs, labels)
    res = int(clf.predict(test)[0] + 0.5)
    return to_predict(res)

print(predict_linear(test))
print(predict_tree(test))
print(predict_svm(test))
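
For reference only (the tasks did not ask for this, and as noted above, writing extra code during the exam mostly cost people time): a minimal sketch of how the two classifiers could be sanity-checked on a held-out split, reusing the inputs and labels arrays from the answer above.

# optional sanity check on a held-out split (not required by the tasks)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm, tree

X_train, X_test, y_train, y_test = train_test_split(inputs, labels, test_size=0.2, random_state=0)
for clf in (tree.DecisionTreeClassifier(), svm.SVC()):
    clf.fit(X_train, y_train)
    print(type(clf).__name__, accuracy_score(y_test, clf.predict(X_test)))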

5. Deep Learning Basics - Text Sentiment Classification

Thinking back to when I was setting the problems before the National Day holiday: I needed a scenario for the deep learning question and eventually settled on text classification, since plenty of neural network architectures can handle that scenario.

Only 2 people fully completed this problem (both of them people I know…), and 5 people managed to build a working model.

Of the four architectures, TextCNN was chosen most often, followed by DNN and LSTM; nobody chose the Transformer QAQ, and nobody tried a hybrid architecture (such as ViT) either.

This problem is actually not that complex, since the requirement was only to implement the model, yet some people still turned in complete training and testing code, which is pretty wild.

My original plan was to ask interviewees about their understanding of the model, but seeing that most of them had already written so much extra, I felt too embarrassed to ask.

# answer, described in PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F

# DNN reference approach
# Train-Acc: 94.44% Test-Acc: 74.33%
class model_DNN(nn.Module):
    def __init__(self, vocab_num, embedding_dim, max_seq_length, classify_num):
        super().__init__()
        self.emb = nn.Embedding(vocab_num, embedding_dim)
        self.fc = nn.Linear(max_seq_length * embedding_dim, classify_num)
        return

    def forward(self, x):
        x = self.emb(x)
        x = x.view(x.shape[0], -1)   # flatten the embedded sequence
        x = self.fc(x)
        x = F.softmax(x, dim=1)
        return x

# LSTM reference approach
# Train-Acc: 95.55% Test-Acc: 81.27%
class model_LSTM(nn.Module):
    def __init__(self, vocab_num, embedding_dim, max_seq_length, classify_num, hidden_size, num_layers):
        super().__init__()
        self.emb = nn.Embedding(vocab_num, embedding_dim)
        # note: input_size is set to max_seq_length, which assumes embedding_dim == max_seq_length
        self.lstm = nn.LSTM(input_size=max_seq_length, hidden_size=hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True, dropout=0.5)
        self.fc1 = nn.Linear(hidden_size * 2, hidden_size)
        self.fc2 = nn.Linear(hidden_size, classify_num)
        return

    def forward(self, x):
        x = self.emb(x)
        x, (h_n, c_n) = self.lstm(x)
        # concatenate the final hidden states of the two directions
        x = torch.cat((h_n[-1, :, :], h_n[-2, :, :]), dim=-1)
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=1)
        return x

# TextCNN reference approach
# Train-Acc: 98.66% Test-Acc: 84.60%
class IMDB_TextCNN(nn.Module):
    def __init__(self, vocab_num):
        super().__init__()
        self.emb = nn.Embedding(vocab_num, 256)
        # one Conv2d per kernel height, each spanning the full 256-dim embedding
        self.convs = nn.ModuleList([nn.Conv2d(1, 256, (k, 256)) for k in (11, 7, 5, 3)])
        self.linear = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 4, 2),
            nn.Softmax(dim=-1),
        )
        return

    def cal_conv_pool(self, x, conv):
        x = F.relu(conv(x)).squeeze(3)
        x = F.max_pool1d(x, x.shape[2]).squeeze(2)   # global max pooling over time
        return x

    def forward(self, x):
        x = self.emb(x)
        x = x.unsqueeze(1)   # add a channel dimension for Conv2d
        x = torch.cat([self.cal_conv_pool(x, conv) for conv in self.convs], -1)
        x = self.linear(x)
        return x

# Transformer reference approach
# Train-Acc: 50.00% Test-Acc: 50.00%
class IMDB_Transformer(nn.Module):
    def __init__(self, vocab_num):
        super().__init__()
        self.emb = nn.Embedding(vocab_num, 256)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=256,
                nhead=8,
            ),
            num_layers=1,
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(256 * 256, 2),   # assumes a flattened sequence length of 256
            nn.Softmax(dim=-1),
        )
        return

    def forward(self, x):
        x = self.emb(x)
        x = self.transformer(x)
        x = self.fc(x)
        return x
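
A rough smoke test I would run on these models to check the output shapes. The vocab_num, sequence length, and hidden sizes below are placeholder assumptions (the TextCNN and Transformer answers hard-code a 256-dim embedding, and the Transformer's flatten layer additionally assumes a sequence length of 256):

# shape smoke test with a fake batch (hyperparameters are placeholders)
import torch

vocab_num, embedding_dim, max_seq_length, classify_num = 10000, 256, 256, 2
x = torch.randint(0, vocab_num, (8, max_seq_length))  # batch of 8 token-id sequences

models = [
    model_DNN(vocab_num, embedding_dim, max_seq_length, classify_num),
    model_LSTM(vocab_num, embedding_dim, max_seq_length, classify_num, hidden_size=128, num_layers=2),
    IMDB_TextCNN(vocab_num),
    IMDB_Transformer(vocab_num),
]
for m in models:
    print(type(m).__name__, m(x).shape)  # each should print torch.Size([8, 2])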